{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<b>Steps:</b>\n",
    "\n",
    "1. Use the New York Times API to fetch the new articles based on the keywords\n",
    "2. Iterate and save all the urls\n",
    "3. Hit the urls seperately and fetch the html content\n",
    "4. Use the BeautifulSoap to parse the html\n",
    "4. Get the content of the url and save the whole content in a single file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import sys\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "import os\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!{sys.executable} -m pip install beautifulsoup4"
   ]
  },
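  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before the scraping function, here is a minimal BeautifulSoup sketch on a toy HTML snippet (the markup and the `story-content` class here are purely illustrative), showing the `find_all` pattern used later in this notebook:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy HTML standing in for a fetched article page (hypothetical markup)\n",
    "toy_html = '<html><body><p class=\"story-content\">First paragraph.</p><p class=\"story-content\">Second paragraph.</p></body></html>'\n",
    "toy_soup = BeautifulSoup(toy_html, 'html.parser')\n",
    "for p in toy_soup.find_all('p', class_='story-content'):\n",
    "    print(p.get_text())"
   ]
  },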
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_tag(raw_html):\n",
    "    cleanr = re.compile('<.*?>')\n",
    "    cleantext = re.sub(cleanr, '', raw_html)\n",
    "    return cleantext"
   ]
  },
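  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `remove_tag` on a toy snippet. For comparison, BeautifulSoup's `get_text()` strips markup more robustly (it handles nested and malformed tags), so it is a reasonable alternative:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both lines should print: Hello world\n",
    "print(remove_tag('<p>Hello <b>world</b></p>'))\n",
    "print(BeautifulSoup('<p>Hello <b>world</b></p>', 'html.parser').get_text())"
   ]
  },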
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data(keyword, folder_name, file_count):\n",
    "    \n",
    "    web_url = []\n",
    "    for i in range(0,100): \n",
    "        params = {\n",
    "             'api-key': '', #ToDo: Add your api-key\n",
    "             'begin_date': '', #ToDo: Add start date\n",
    "             'end_date': '', #ToDo: Add end date\n",
    "             'q': keyword,\n",
    "             'page' : str(i),\n",
    "             'fl': \"web_url\",\n",
    "        }\n",
    "        # Sleep for 1 second so that you dont exceed the NY api hit limit per second\n",
    "        time.sleep(1)\n",
    "        nytimes_response = requests.get(url='https://api.nytimes.com/svc/search/v2/articlesearch.json', params= params)\n",
    "        str_nytimes_response = nytimes_response.content.decode(\"utf-8\") \n",
    "        json_nytimes_response = json.loads(str_nytimes_response)\n",
    "        print(\"iteration: \"+ str(i))\n",
    "        \n",
    "        # iterate the respone and if docs is present inside response then append the web url\n",
    "        if(json_nytimes_response['response']['docs'] != []):\n",
    "            for i in json_nytimes_response['response']['docs']:\n",
    "                web_url.append(i['web_url'])\n",
    "            print(len(web_url))\n",
    "        else :\n",
    "            break\n",
    "    # For each url obtained hit the url and get the content         \n",
    "    for url in web_url:\n",
    "        url_response = requests.get(url)\n",
    "        # Use BeautifulSoup to parse the html\n",
    "        soup = BeautifulSoup(url_response.content, 'html.parser')\n",
    "        \n",
    "        # Get the story content from the paragraph. Note this depends on the websites and for new york times it was mostly found to be story-content\n",
    "        story = soup.findAll(\"p\", class_=\"story-content\")\n",
    "        e2kc3sl0 = soup.findAll(\"p\", class_=\"e2kc3sl0\")\n",
    "\n",
    "        web_text = ''\n",
    "\n",
    "        # Append the various paragraphs and make it a complete content\n",
    "        for text in story:\n",
    "            temp = remove_tag(str(text))\n",
    "            web_text = web_text+\" \"+temp\n",
    "            \n",
    "        for e in e2kc3sl0:\n",
    "            t = remove_tag(str(e))\n",
    "            web_text = web_text+\" \"+t\n",
    "        \n",
    "        file_count = file_count + 1;\n",
    "        #Save the file in the notation [end_data]_[file_count].txt\n",
    "        file_name = keyword+params['end_date']+\"_\"+str(file_count)+\".txt\"\n",
    "        \n",
    "        print(\"File Name: \"+file_name)\n",
    "        print(\"url: \"+url)\n",
    "        \n",
    "        filepath = os.path.join(os.getcwd()+'/'+folder_name, file_name)\n",
    "        if not os.path.exists(os.getcwd()+'/'+folder_name):\n",
    "            os.makedirs(os.getcwd()+'/'+folder_name)\n",
    "        f = open(filepath, \"w\")\n",
    "        f.write(web_text)\n",
    "        f.close()\n"
   ]
  },
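  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before looping over all keywords, it can help to probe the Article Search API once and inspect the response shape. A minimal sketch, assuming you have a valid API key (`blockchain` is just a sample query):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single exploratory request; same v2 endpoint and fields as get_data above\n",
    "probe_params = {\n",
    "    'api-key': '',  # ToDo: Add your api-key\n",
    "    'q': 'blockchain',\n",
    "    'page': '0',\n",
    "    'fl': 'web_url',\n",
    "}\n",
    "probe = requests.get('https://api.nytimes.com/svc/search/v2/articlesearch.json', params=probe_params)\n",
    "docs = probe.json().get('response', {}).get('docs', [])\n",
    "print(len(docs), 'docs on page 0')\n",
    "print([d.get('web_url') for d in docs[:3]])"
   ]
  },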
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "keywords = [\"blockchain\",\"ethereum\",\"r3 corda\",\"bitcoin\",\"hyperledger\",\"crypto\", \"initial coin offering\",\"ether\",\"dapp\",\"litecoin\"]\n",
    "for key in keywords:\n",
    "    get_data(key,\"\",0) # toDo: Enter folder Name"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
